{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"classification_results.ipynb","provenance":[],"collapsed_sections":["YUI96qV08o_W"],"authorship_tag":"ABX9TyO/LWDFrtl0loYxr0PdTO3j"},"kernelspec":{"display_name":"Python 3","name":"python3"}},"cells":[{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"H_7dKWiaER2n","executionInfo":{"status":"ok","timestamp":1635282408865,"user_tz":240,"elapsed":21922,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}},"outputId":"a51b2b2f-88f6-4a3d-dd12-ad5d9f3bd451"},"source":["from google.colab import drive\n","drive.mount('/content/gdrive')\n","%cd 'gdrive/MyDrive/SouthAfrica/'"],"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/gdrive\n","/content/gdrive/MyDrive/SouthAfrica\n"]}]},{"cell_type":"code","metadata":{"id":"idLGN6KHEtY2","executionInfo":{"status":"ok","timestamp":1635283149415,"user_tz":240,"elapsed":659,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}}},"source":["import pandas as pd\n","import numpy as np\n","import nltk\n","from nltk.corpus import stopwords\n","from nltk.stem.snowball import SnowballStemmer\n","from sklearn.feature_extraction.text import TfidfVectorizer\n","from sklearn.model_selection import train_test_split, cross_val_score\n","from sklearn.pipeline import make_pipeline\n","import re\n","from typing import List, Tuple, Dict\n","from sklearn.preprocessing import MaxAbsScaler\n","from sklearn.linear_model import RidgeClassifier, LogisticRegression\n","from sklearn.ensemble import RandomForestClassifier\n","import xgboost as xgb\n","\n","from sklearn import metrics\n","from tqdm import tqdm\n","import 
pickle\n","\n","\n","tqdm.pandas()"],"execution_count":18,"outputs":[]},
{"cell_type":"markdown","metadata":{"id":"YUI96qV08o_W"},"source":["### Classification"]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dMW8N8UU8zvS","executionInfo":{"status":"ok","timestamp":1635282555287,"user_tz":240,"elapsed":617,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}},"outputId":"fa5c0b37-f3ec-4225-e1cc-4eadcf73cecf"},"source":["nltk.download('stopwords')\n","nltk.download('punkt')\n","stop_words = stopwords.words('english')\n","stemmer = SnowballStemmer(\"english\")"],"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["[nltk_data] Downloading package stopwords to /root/nltk_data...\n","[nltk_data]   Unzipping corpora/stopwords.zip.\n","[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data]   Unzipping tokenizers/punkt.zip.\n"]}]},
{"cell_type":"code","metadata":{"id":"dvOQ_84b8oqJ","executionInfo":{"status":"ok","timestamp":1635282556771,"user_tz":240,"elapsed":246,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}}},"source":["def removeStopWords(sentence: str) -> str:\n",
"    \"\"\"Tokenize `sentence` with NLTK and drop every token listed in `stop_words`.\n",
"\n",
"    Returns the remaining tokens joined by single spaces.\n",
"    \"\"\"\n",
"    # Set membership is O(1) per token; scanning the `stop_words` list is O(len(list)).\n",
"    stop_word_set = set(stop_words)\n",
"    tokens = nltk.word_tokenize(sentence)\n",
"    return ' '.join(token for token in tokens if token not in stop_word_set)\n",
"\n",
"def stemming(sentence: str) -> str:\n",
"    \"\"\"Stem each whitespace-separated word of `sentence` with the module-level `stemmer`.\n",
"\n",
"    Returns the stems joined by single spaces, without leading/trailing whitespace.\n",
"    \"\"\"\n",
"    return ' '.join(stemmer.stem(word) for word in sentence.split()).strip()\n",
"\n",
"def keep_only_letters(x: str) -> str:\n",
"    \"\"\"Replace every run of characters other than ASCII letters with one space.\"\"\"\n",
"    return re.sub(r'[^a-zA-Z]+', ' ', x)\n",
"\n",
"def clean_text(x: str) -> str:\n",
"    \"\"\"Prepare one raw document for vectorizing.\n",
"\n",
"    Steps, in order: lowercase, keep letters only, remove stop words, stem.\n",
"    \"\"\"\n",
"    x = x.lower()\n",
"    x = keep_only_letters(x)\n",
"    x = removeStopWords(x)\n",
"    x = stemming(x)\n",
"\n",
"    return x"],"execution_count":4,"outputs":[]},
{"cell_type":"code","metadata":{"id":"DTFYP7Jb9EJB","colab":{"base_uri":"https://localhost:8080/","height":204},"executionInfo":{"status":"ok","timestamp":1635282566798,"user_tz":240,"elapsed":6364,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}},"outputId":"a8006780-6899-41e5-d2ba-deac347a2849"},"source":["text = pd.read_csv('no_tweet_text.csv')\n","text.head()"],"execution_count":5,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>doc_id</th>\n","      <th>text</th>\n","      <th>category</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>5320a171a8a0b805134c755f</td>\n","      <td>Caxton's local bachelor competition proudly pr...</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>5320a173a8a0b805134c7560</td>\n","      <td>Caxton's local bachelor competition proudly pr...</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>5320a175a8a0b805134c7561</td>\n","      <td>Last Wednesday MMC Clr Bennett Nikani was in T...</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>5320a176a8a0b805134c7562</td>\n","      <td>Mareike (Lika Berning) is a city girl and succ...</td>\n","      <td>NaN</td>\n","    
</tr>\n","    <tr>\n","      <th>4</th>\n","      <td>5320a177a8a0b805134c7563</td>\n","      <td>Ip Man's peaceful life in Foshan changes after...</td>\n","      <td>NaN</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>"],"text/plain":["                     doc_id  ... category\n","0  5320a171a8a0b805134c755f  ...      NaN\n","1  5320a173a8a0b805134c7560  ...      NaN\n","2  5320a175a8a0b805134c7561  ...      NaN\n","3  5320a176a8a0b805134c7562  ...      NaN\n","4  5320a177a8a0b805134c7563  ...      NaN\n","\n","[5 rows x 3 columns]"]},"metadata":{},"execution_count":5}]},
{"cell_type":"code","metadata":{"id":"ihPXyaom9NCN","executionInfo":{"status":"ok","timestamp":1635282569338,"user_tz":240,"elapsed":186,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}}},"source":["test_data = text[text.category.isna()].copy()\n","train_data = text[~text.category.isna()].copy()"],"execution_count":6,"outputs":[]},
{"cell_type":"code","metadata":{"id":"zRvSCB5A9dBz","executionInfo":{"status":"ok","timestamp":1635282573192,"user_tz":240,"elapsed":942,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}}},"source":["test_data = test_data.drop_duplicates(['text'])"],"execution_count":7,"outputs":[]},
{"cell_type":"code","metadata":{"id":"WfxIMW9t9eVK","executionInfo":{"status":"ok","timestamp":1635282585300,"user_tz":240,"elapsed":160,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}}},"source":["train_data['category'] = train_data['category'].map({'Non-election':0,\n","                                                     'Broad':1,\n","                                                     'Narrow':2})"],"execution_count":8,"outputs":[]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ioq1FFuPcDFe","executionInfo":{"status":"ok","timestamp":1635282603144,"user_tz":240,"elapsed":5085,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}},"outputId":"b2709cf6-4200-4c85-b60a-33cbe9b713d6"},"source":["X_train  = train_data.text.progress_apply(clean_text)\n","y_train = train_data.category"],"execution_count":9,"outputs":[{"output_type":"stream","name":"stderr","text":["100%|██████████| 900/900 [00:04<00:00, 184.86it/s]\n"]}]},
{"cell_type":"code","metadata":{"id":"aBPK5E9dcGsa","executionInfo":{"status":"ok","timestamp":1635283644007,"user_tz":240,"elapsed":140,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}}},"source":["def get_fold_results(classifier_dict:dict, X_train: pd.Series, y_train: pd.Series, cv: int = 10):\n",
"    \"\"\"Cross-validate a TF-IDF + classifier pipeline for every entry of `classifier_dict`.\n",
"\n",
"    Args:\n",
"        classifier_dict: maps a display name to a classifier instance.\n",
"        X_train: documents passed to the TF-IDF vectorizer.\n",
"        y_train: labels aligned with `X_train`.\n",
"        cv: forwarded to `cross_val_score`; defaults to the 10 folds used originally.\n",
"\n",
"    Prints the mean score per classifier and returns None.\n",
"    \"\"\"\n",
"    for clf_name, clf in classifier_dict.items():\n",
"        # The vectorizer is part of the pipeline, so cross_val_score fits it per fold.\n",
"        pipe = make_pipeline(TfidfVectorizer(), clf)\n",
"        scores = cross_val_score(pipe, X_train, y_train, cv=cv)\n",
"        print(f\"{cv}-Fold results {clf_name} accuracy = {scores.mean():.2f}\")"],"execution_count":45,"outputs":[]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0fRwfEojcRxK","executionInfo":{"status":"ok","timestamp":1635283821880,"user_tz":240,"elapsed":100308,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GipFPGThxoPqDdWN5GpM_HhBEYftAKub2QaGtlITg=s64","userId":"10187781003309575652"}},"outputId":"ac418f3c-bee8-442b-dd49-99a20afa0be8"},"source":["classifier_dict = {'Ridge':RidgeClassifier(alpha=.1),\n","                   'Logistic':LogisticRegression(C=1.5),\n","                   'RandomForest':RandomForestClassifier(),\n","                   'XGBoost':xgb.XGBClassifier()}\n","\n","get_fold_results(classifier_dict, X_train, y_train) "],"execution_count":46,"outputs":[{"output_type":"stream","name":"stdout","text":["10-Fold results Ridge accuracy = 0.87\n","10-Fold results Logistic accuracy = 0.82\n","10-Fold results RandomForest accuracy = 0.81\n","10-Fold results XGBoost accuracy = 0.83\n"]}]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jVLqsLn999LF","executionInfo":{"status":"ok","timestamp":1620142107188,"user_tz":240,"elapsed":338983,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiBFpnhdzoALojiPiNb6uNzWCRGxOOhsFyQEFH2YQ=s64","userId":"10187781003309575652"}},"outputId":"87b29146-58db-4ea2-8430-0669b7b4810d"},"source":["X_test = test_data.text.progress_apply(clean_text)\n"],"execution_count":null,"outputs":[{"output_type":"stream","text":["100%|██████████| 900/900 [00:04<00:00, 223.95it/s]\n","100%|██████████| 73987/73987 [05:34<00:00, 221.52it/s]\n"],"name":"stderr"}]},
{"cell_type":"code","metadata":{"id":"Xu15e5NQ-_Qd"},"source":["vect = TfidfVectorizer()\n",
"# Same alpha as the 'Ridge' entry scored in the 10-fold comparison, so the reported\n",
"# accuracy describes the model that actually labels the test data.\n",
"clf = RidgeClassifier(alpha=.1)\n",
"\n",
"pipe = make_pipeline(vect,clf)"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"VP43tXCY_suj","executionInfo":{"status":"ok","timestamp":1620142124573,"user_tz":240,"elapsed":503,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiBFpnhdzoALojiPiNb6uNzWCRGxOOhsFyQEFH2YQ=s64","userId":"10187781003309575652"}},"outputId":"7cf8a384-880b-487c-c0b1-b05b4371308a"},"source":["pipe.fit(X_train,y_train)"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"F-B0ssog_wkU"},"source":["y_pred = pipe.predict(X_test)"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"iP5t2gQf_0f5","executionInfo":{"status":"ok","timestamp":1620142307062,"user_tz":240,"elapsed":270,"user":{"displayName":"Stefano Dantas","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GiBFpnhdzoALojiPiNb6uNzWCRGxOOhsFyQEFH2YQ=s64","userId":"10187781003309575652"}},"outputId":"d776ae59-47f0-4c5d-b737-6fc6b2947e4f"},"source":["test_data['category'] = y_pred"],"execution_count":null,"outputs":[{"output_type":"stream","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n","A value is trying to be set on a copy of a slice from a DataFrame.\n","Try using .loc[row_indexer,col_indexer] 
= value instead\n","\n","See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n","  \"\"\"Entry point for launching an IPython kernel.\n"],"name":"stderr"}]},
{"cell_type":"code","metadata":{"id":"eWfjTcQ1AcVx"},"source":["# Keep the rows whose predicted label is not 0, then turn the codes back into names.\n",
"predictions = test_data.loc[test_data['category'] != 0].copy()\n",
"predictions['category'] = predictions['category'].map({1: 'Broad', 2: 'Narrow'})"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"Yh7Qk8p4CbP-"},"source":["# Rows of the original file already labelled 'Broad' or 'Narrow', followed by the predicted ones.\n",
"mask = text['category'].isin(['Broad', 'Narrow'])\n",
"final_data = pd.concat([text.loc[mask], predictions])"],"execution_count":null,"outputs":[]},
{"cell_type":"code","metadata":{"id":"X9CF5_77DNXK"},"source":["# The DataFrame index is written as the first CSV column (pandas default).\n",
"final_data.to_csv('text_categories.csv', index=True)"],"execution_count":null,"outputs":[]}]}